Load libraries
This project uses renv to keep track of installed packages. Install renv if not installed and load dependencies with renv::restore().
install.packages("renv")
renv::restore()
Read data
- Get list of samples
Code
samples <- read_tsv("config/samples.tsv", show_col_types = FALSE)
units <- read_tsv("config/units.tsv", show_col_types = FALSE)
sample_units <- dplyr::left_join(samples, units, by = "sample_name") %>%
unite(sample_unit, sample_name, unit_name, remove = FALSE)
sample_units
- Read Samtools
idxstats to get human coverage for normalization
Notes:
- The counts include the total number of reads aligned, they are not limited to uniquely aligned reads.
- The counts are reads, not pairs or fragments
Code
idxstats_exogenousrna_dir <-
"results/samtools_idxstats/exogenous_rna/"
idxstats_human_dir <-
"results/samtools_idxstats/Homo_sapiens.GRCh38.dna.primary_assembly/"
bowtie2_human_logs <-
"results/logs/bowtie2/Homo_sapiens.GRCh38.dna.primary_assembly/"
idxstats <- tibble()
for (row in seq_len(nrow(sample_units))) {
sample <- sample_units[row, ]$sample_unit
# Read `idsxstats` for exogenous mapped reads
exogenous_rna_stats <- read_tsv(
file.path(idxstats_exogenousrna_dir, sprintf("%s.bam.idxstats", sample)),
col_names = c(
"sequence_name", "sequence_length",
"mapped_reads", "unmapped_reads"
),
col_types = "ciii"
)
exogenous_rna_mapped_reads <- exogenous_rna_stats %>%
filter(!sequence_name %in% c("*")) %>%
select(sequence_name, mapped_reads) %>%
mutate(sample = sample)
# Read `idxstats` for human mapped reads
human_stats <- read_tsv(
file.path(idxstats_human_dir, sprintf("%s.bam.idxstats", sample)),
col_names = c(
"sequence_name", "sequence_length",
"mapped_reads", "unmapped_reads"
),
col_types = "ciii"
)
grch38_mapped_reads <- human_stats %>%
filter(!sequence_name %in% c("*")) %>%
select(mapped_reads) %>%
sum()
grch38_mapped_reads <- tibble(
sequence_name = "grch38_mapped_reads",
mapped_reads = grch38_mapped_reads,
sample = sample
)
# Read bowtie2 logs for unmapped reads
bowtie2_log <- readLines(
file.path(bowtie2_human_logs, sprintf("%s.log", sample))
)
total_pairs <- strtoi(str_split(bowtie2_log[1], " ")[[1]][1])
total_reads <- total_pairs * 2
unmapped_reads <- tibble(
sequence_name = "unmapped",
mapped_reads = total_reads - grch38_mapped_reads$mapped_reads,
sample = sample
)
# Consolidate counts for rows
idxstats <- rbind(
idxstats,
exogenous_rna_mapped_reads,
grch38_mapped_reads,
unmapped_reads
)
}
idxstats
Coverage
Concordant vs Discordant paired reads
Concordant pairs are pairs of reads that:
- Align on the same pegRNA
- Align within 500 bp of each other
- Align in the expected forward-reverse orientation (
--> .. <--)
Discordant reads aligned but whose mate:
- Did not align (on the pegRNA)
- Aligned more than 500 bp away
- Aligned in an unexpected orientation
Code
## Config and function definition
bam_dir <- "results/alignments/exogenous_rna/sorted"
last_day <- 0
cols <- brewer.pal(n = 5, name = "RdBu")
concordant_cell_line_colors <- list(
"Parental" = "#CA0020",
"P1E10" = "#0571B0"
)
discordant_cell_line_colors <- list(
"Parental" = "#F4A582",
"P1E10" = "#92C5DE"
)
# Exogenous RNA mixtures
rna_mixes <- tibble()
for (mix in c("mastermix1", "mastermix2", "PJY103", "PJY300")) {
t <- readDNAStringSet(sprintf("data/references/%s.fa", mix))
rna_mixes <- rbind(rna_mixes, tibble(
exogenous_rna = mix,
rna_species = word(t@ranges@NAMES, 1),
start = 1,
end = t@ranges@width
))
}
rna_mixes <- rna_mixes %>%
rows_update(tibble(exogenous_rna = "PJY103", start = 331, end = 460),
by = "exogenous_rna"
) %>%
rows_update(tibble(exogenous_rna = "PJY300", start = 331, end = 497),
by = "exogenous_rna"
)
rna_mixes
Code
PJY103 / PJY300 (Batch 2)
Code
sample_list <- rna_mixes %>% filter(grepl("PJY103|PJY300", rna_species))
for (start_offset in c(NA, 5)) {
for (norm_factor in c("exogenous_rna_mapped_reads", "grch38_mapped_reads")) {
for (i in seq_len(nrow(sample_list))) {
rna_species <- sample_list[[i, "rna_species"]]
mix <- sample_list[[i, "exogenous_rna"]]
pegrna_plots(
sequence_name = rna_species,
normalization_factor = norm_factor,
mix = mix,
ylab = sprintf("%s coverage (normalized to %s)", mix, norm_factor),
vlines = seq(sample_list[i, ]$start, sample_list[i, ]$end, 2),
start_offset = start_offset
)
}
}
}
VEGFA
Code
sample_list <- rna_mixes %>% filter(grepl("VEGFA", rna_species))
for (start_offset in c(NA, 5)) {
for (norm_factor in c("exogenous_rna_mapped_reads", "grch38_mapped_reads")) {
for (i in seq_len(nrow(sample_list))) {
rna_species <- sample_list[[i, "rna_species"]]
mix <- sample_list[[i, "exogenous_rna"]]
pegrna_plots(
sequence_name = rna_species,
normalization_factor = norm_factor,
mix = mix,
ylab = sprintf("%s coverage (normalized to %s)", mix, norm_factor),
vlines = seq(sample_list[i, ]$start, sample_list[i, ]$end, 2),
start_offset = start_offset
)
}
}
}
FANCF
Code
sample_list <- rna_mixes %>% filter(grepl("FANCF", rna_species))
for (start_offset in c(NA, 5)) {
for (norm_factor in c("exogenous_rna_mapped_reads", "grch38_mapped_reads")) {
for (i in seq_len(nrow(sample_list))) {
rna_species <- sample_list[i, ]$rna_species
pegrna_plots(
sequence_name = rna_species,
normalization_factor = norm_factor,
ylab = sprintf("Coverage (normalized to %s)", norm_factor),
vlines = seq(sample_list[i, ]$start, sample_list[i, ]$end, 2),
start_offset = start_offset
)
}
}
}
HEK3
Code
sample_list <- rna_mixes %>% filter(grepl("HEK3", rna_species))
for (start_offset in c(NA, 5)) {
for (norm_factor in c("exogenous_rna_mapped_reads", "grch38_mapped_reads")) {
for (i in seq_len(nrow(sample_list))) {
rna_species <- sample_list[i, ]$rna_species
pegrna_plots(
sequence_name = rna_species,
normalization_factor = norm_factor,
ylab = sprintf("Coverage (normalized to %s)", norm_factor),
vlines = seq(sample_list[i, ]$start, sample_list[i, ]$end, 2),
start_offset = start_offset
)
}
}
}
DNMT1
Code
sample_list <- rna_mixes %>% filter(grepl("DNMT1", rna_species))
for (start_offset in c(NA, 5)) {
for (norm_factor in c("exogenous_rna_mapped_reads", "grch38_mapped_reads")) {
for (i in seq_len(nrow(sample_list))) {
rna_species <- sample_list[i, ]$rna_species
pegrna_plots(
sequence_name = rna_species,
normalization_factor = norm_factor,
ylab = sprintf("Coverage (normalized to %s)", norm_factor),
vlines = seq(sample_list[i, ]$start, sample_list[i, ]$end, 2),
start_offset = start_offset
)
}
}
}
RUNX1
Code
sample_list <- rna_mixes %>% filter(grepl("RUNX1", rna_species))
for (start_offset in c(NA, 5)) {
for (norm_factor in c("exogenous_rna_mapped_reads", "grch38_mapped_reads")) {
for (i in seq_len(nrow(sample_list))) {
rna_species <- sample_list[i, ]$rna_species
pegrna_plots(
sequence_name = rna_species,
normalization_factor = norm_factor,
ylab = sprintf("Coverage (normalized to %s)", norm_factor),
vlines = seq(sample_list[i, ]$start, sample_list[i, ]$end, 2),
start_offset = start_offset
)
}
}
}
EMX1
Code
sample_list <- rna_mixes %>% filter(grepl("EMX1", rna_species))
for (start_offset in c(NA, 5)) {
for (norm_factor in c("exogenous_rna_mapped_reads", "grch38_mapped_reads")) {
for (i in seq_len(nrow(sample_list))) {
rna_species <- sample_list[i, ]$rna_species
pegrna_plots(
sequence_name = rna_species,
normalization_factor = norm_factor,
ylab = sprintf("Coverage (normalized to %s)", norm_factor),
vlines = seq(sample_list[i, ]$start, sample_list[i, ]$end, 2),
start_offset = start_offset
)
}
}
}
RNF2
Code
sample_list <- rna_mixes %>% filter(grepl("RNF2", rna_species))
for (start_offset in c(NA, 5)) {
for (norm_factor in c("exogenous_rna_mapped_reads", "grch38_mapped_reads")) {
for (i in seq_len(nrow(sample_list))) {
rna_species <- sample_list[i, ]$rna_species
pegrna_plots(
sequence_name = rna_species,
normalization_factor = norm_factor,
ylab = sprintf("Coverage (normalized to %s)", norm_factor),
vlines = seq(sample_list[i, ]$start, sample_list[i, ]$end, 2),
start_offset = start_offset
)
}
}
}
Strandedness
Code
source("pegrna_alignment_strandedness.R")
for (rna_species in rna_mixes %>%
select(rna_species) %>%
unique() %>%
pull()) {
stranded_counts <- pegrna_alignment_strandedness(sequence_name = rna_species)
p <- ggplot(
data = stranded_counts,
aes(x = sample_unit, y = count, fill = strand)
) +
geom_bar(stat = "identity") +
labs(title = rna_species) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
print(p)
}